In [ ]:
import nltk
# Download the Project Gutenberg corpus used throughout this notebook.
nltk.download('gutenberg')
# nltk.download('maxent_treebank_pos_tagger')
In [ ]:
from nltk.corpus import gutenberg
# Load the stored corpus and list the bundled file ids (titles).
gutenberg_files = gutenberg.fileids()
gutenberg_files
In [ ]:
# Inspect a specific text: load the full contents of austen-emma.txt.
# gutenberg.raw() returns the whole document as one string and, unlike
# gutenberg.open(...).read(), does not leave the corpus stream open.
gutenberg_doc = gutenberg.raw('austen-emma.txt')
print(gutenberg_doc[:200])
In [ ]:
# Tokenize: split the sentence into word-level tokens and store them as a
# list (word_tokenize also splits off punctuation and contractions,
# not just whitespace).
sentence = """At eight o'clock on Thursday morning ... Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)
tokens
In [ ]:
# PoS tagging: annotate each token with a part-of-speech tag,
# yielding a list of (token, tag) tuples.
tagged = nltk.pos_tag(tokens)
tagged
for token_and_tag in tagged:
    print(token_and_tag)
In [ ]:
# Print every noun (tags starting with 'N') in "token/TAG" form.
for token, tag in tagged:
    if tag.startswith('N'):
        print(token.lower() + '/' + tag)
# The same expressed as a list comprehension.
tagged_word = [token.lower() + '/' + tag for token, tag in tagged if tag.startswith('N')]
In [ ]:
# Check the result: nouns formatted as "token/TAG".
tagged_word
https://www.cis.upenn.edu/~treebank/
CC Coordinating conjunction
CD Cardinal number
DT Determiner
EX Existential there
FW Foreign word
IN Preposition or subordinating conjunction
JJ Adjective
JJR Adjective, comparative
JJS Adjective, superlative
LS List item marker
MD Modal
NN Noun, singular or mass
NNS Noun, plural
NNP Proper noun, singular
NNPS Proper noun, plural
PDT Predeterminer
POS Possessive ending
PRP Personal pronoun
PRP$ Possessive pronoun
RB Adverb
RBR Adverb, comparative
RBS Adverb, superlative
RP Particle
SYM Symbol
TO to
UH Interjection
VB Verb, base form
VBD Verb, past tense
VBG Verb, gerund or present participle
VBN Verb, past participle
VBP Verb, non-3rd person singular present
VBZ Verb, 3rd person singular present
WDT Wh-determiner
WP Wh-pronoun
WP$ Possessive wh-pronoun
WRB Wh-adverb
In [ ]:
# Tokenize the full document and PoS-tag every token.
gutenberg_tokens = nltk.word_tokenize(gutenberg_doc)
gutenberg_tagged = nltk.pos_tag(gutenberg_tokens)
In [ ]:
# To measure how long this takes, use the time library.
import time

# perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() is wall-clock and can jump if the
# system clock is adjusted.
start_time = time.perf_counter()
## code under measurement
gutenberg_tokens = nltk.word_tokenize(gutenberg_doc)
gutenberg_tagged = nltk.pos_tag(gutenberg_tokens)
print("Processed time = ",(time.perf_counter() - start_time))
In [ ]:
# First 10 tokens of the document.
gutenberg_tokens[:10]
In [ ]:
# First 10 (token, tag) pairs.
gutenberg_tagged[:10]
In [ ]:
# Switch to austen-sense.txt and redo tokenization + tagging.
# gutenberg.raw() returns the document as one string and, unlike
# gutenberg.open(...).read(), does not leave the corpus stream open.
gutenberg_doc = gutenberg.raw('austen-sense.txt')
gutenberg_tokens = nltk.word_tokenize(gutenberg_doc)
gutenberg_tagged = nltk.pos_tag(gutenberg_tokens)
In [ ]:
# First 10 tokens of the new document.
gutenberg_tokens[:10]
In [ ]:
# First 10 (token, tag) pairs.
gutenberg_tagged[:10]
단어의 어근을 추출하기 위해 stemming!
Lemmatisation is closely related to stemming. The difference is that a stemmer operates on a single word without knowledge of the context, and therefore cannot discriminate between words which have different meanings depending on part of speech. However, stemmers are typically easier to implement and run faster, and the reduced accuracy may not matter for some applications.
The word "meeting" can be either the base form of a noun or a form of a verb ("to meet") depending on the context, e.g., "in our last meeting" or "We are meeting again tomorrow". Unlike stemming, lemmatisation can in principle select the appropriate lemma depending on the context.
In [ ]:
# Lemmatization: reduce each token to its dictionary base form.
# NOTE(review): no PoS argument is passed, so WordNetLemmatizer treats
# every token as a noun (its default) — verb forms like "running" are
# returned unchanged.
lemma = nltk.wordnet.WordNetLemmatizer()
# Lemmatize every token; a comprehension replaces the manual append loop.
gutenberg_lemma = [lemma.lemmatize(token) for token in gutenberg_tokens]
gutenberg_lemma[:20]
In [ ]:
# Pipeline so far: tokenizing -> lemmatizing -> PoS tagging.
gutenberg_lemma_tagged = nltk.pos_tag(gutenberg_lemma)
gutenberg_lemma_tagged[:20]
In [ ]:
# Stemming: strip affixes with the rule-based Porter stemmer
# (no dictionary lookup, unlike lemmatization).
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# Stem every token; a comprehension replaces the manual append loop.
gutenberg_stemmed = [porter_stemmer.stem(token) for token in gutenberg_tokens]
gutenberg_stemmed[:20]
In [ ]:
# Time PoS tagging over the stemmed tokens. perf_counter() is a
# monotonic high-resolution timer suited to elapsed-time measurement,
# unlike wall-clock time.time().
start_time = time.perf_counter()
gutenberg_stemmed_tagged = nltk.pos_tag(gutenberg_stemmed)
print(time.perf_counter() - start_time)
#gutenberg_stemmed_tagged
In [ ]:
# compare Stemming & Lemmatization
# The rule-based stemmer strips the suffix, while the WordNet lemmatizer
# is called here without a PoS, so it treats the word as a noun (its
# default) and may leave verb forms unchanged.
print(porter_stemmer.stem('running'))
print(lemma.lemmatize('running'))
In [ ]:
import collections # provides Counter and other helpers for turning tokens into a dictionary-like mapping
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # matplotlib for visualising the results
%matplotlib inline
# Convert to (token, tag) -> frequency form with collections.Counter
# and print the 50 most common pairs.
print(collections.Counter(gutenberg_stemmed_tagged).most_common(50))
In [ ]:
# Split the (token, frequency) tuples into parallel token / frequency lists.
top_pairs = collections.Counter(gutenberg_stemmed_tagged).most_common(10)
token_list = [pair[0] for pair in top_pairs]
freq_list = [pair[1] for pair in top_pairs]
print(token_list[:4])
print(freq_list[:4])
In [ ]:
# Store the split lists as a pandas DataFrame.
tokens_df = pd.DataFrame(token_list)  # (word, tag) tuples -> two columns
freqs_df = pd.DataFrame(freq_list)    # frequencies -> one column
data = pd.concat([tokens_df, freqs_df], axis=1)
data.columns = ['word','tag','freq']
data.head()
In [ ]:
# A frequency plot that labels the x axis with the (word, tag) items
# themselves instead of word indices.
freqdist = nltk.FreqDist(gutenberg_lemma_tagged)
freqdist.plot(50)
freqdist.plot(50,cumulative=True)
In [ ]:
# Use the stop-word list that ships with NLTK.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# Show the first 20 entries (set iteration order is arbitrary);
# break out instead of scanning the entire set once 20 are printed.
for index, ele in enumerate(stop_words):
    if index >= 20:
        break
    print(index, ele)
In [ ]:
# Domain-specific stop-words can also be added via update();
# punctuation marks are treated as stop-words here.
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
# Comparison is case-sensitive, so work in lowercase. A single pass
# replaces the original pair of list comprehensions that each re-scanned
# gutenberg_lemma_tagged with the same duplicated condition.
filtered_words = []
filtered_tag = []
for word, tag in gutenberg_lemma_tagged:
    lowered = word.lower()
    if lowered not in stop_words:
        filtered_words.append(lowered)
        filtered_tag.append(tag.lower())
filtered_words[:10]
In [ ]:
# NOTE: 'mr.' still shows up; adding it to the stop-word list would drop it.
freqdist = nltk.FreqDist(filtered_words)
freqdist.plot(50)
freqdist.plot(50,cumulative=True)
In [ ]:
# Build the result DataFrame.
result = pd.DataFrame()
result['filtered_word'] = filtered_words
# BUG FIX: this column holds lowercased PoS tags, not frequencies —
# the old column name 'filtered_freq' was misleading.
result['filtered_tag'] = filtered_tag
# Save as CSV.
result.to_csv("filtered_word.csv")